Lendo os dados
resultados_avaliacoes = read_avaliacoes()
## Parsed with column specification:
## cols(
## id = col_character(),
## item = col_character(),
## municipio = col_character(),
## criterio = col_character(),
## aproach = col_character(),
## date = col_datetime(format = ""),
## valid = col_logical(),
## contNodeNumberAccess = col_double(),
## found = col_logical(),
## pathSought = col_character(),
## durationMin = col_double(),
## duration = col_double(),
## tipo_exp = col_character()
## )
resultados_avaliacoes[is.na(resultados_avaliacoes)] <- ""
gararito = read_gabaritos()
## Parsed with column specification:
## cols(
## municipio = col_character(),
## criterio = col_character(),
## item = col_character(),
## encontrado = col_logical(),
## local_encontrado = col_character(),
## local_encontrado_2 = col_character()
## )
gararito[is.na(gararito)] <- ""
empresas_portais <- readr::read_csv(here::here("data/empresas_portais.csv"))
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9], 'X10' [10],
## 'X11' [11], 'X12' [12], 'X13' [13], 'X14' [14], 'X15' [15], 'X16' [16],
## 'X17' [17], 'X18' [18]
## Parsed with column specification:
## cols(
## municipio = col_character(),
## link_portal_transp = col_character(),
## link_prefeitura = col_character(),
## observacoes = col_character(),
## fornecedor = col_character(),
## tipo_fornecer = col_character(),
## `Fornecedor: Gestões Anteriores` = col_character(),
## X8 = col_character(),
## X9 = col_logical(),
## X10 = col_logical(),
## X11 = col_logical(),
## X12 = col_logical(),
## X13 = col_logical(),
## X14 = col_logical(),
## X15 = col_logical(),
## X16 = col_logical(),
## X17 = col_logical(),
## X18 = col_character()
## )
Adicionando combinação encontrada em cada município no gabarito
empresas_portais <- empresas_portais %>%
select(municipio, fornecedor)
gararito<-left_join(gararito, empresas_portais, by=c("municipio"))
Juntando Avaliações e Gabaritos
# concatena os dois csv o do gabarito e avaliações do crawler
data<-left_join(resultados_avaliacoes, gararito, by=c("municipio", "item", "criterio"))
Sumarizando as avaliações
precisao <- data %>%
group_by(municipio, criterio, item, aproach, date) %>%
mutate(
#verifica se a avaliação foi acertiva
tp = (valid == TRUE
& valid == encontrado
#valida se no gabarito e na avaliação o item foi encontrado na mesma url
& (grepl(local_encontrado, pathSought) |
grepl(local_encontrado_2, pathSought))) | (valid == FALSE
& valid == encontrado),
fn = valid == FALSE
& encontrado == TRUE,
fp = valid == TRUE
& encontrado == FALSE
)
precisao %>%
datatable(options = list(pageLength = 10), rownames = FALSE, class = 'cell-border stripe')
Quantificando métricas
metricas_result <- precisao %>%
#filter(!is.na(aproach )) %>%
group_by(municipio, aproach, date) %>%
summarise(
total_itens = n(),
tp_total = sum(tp),
fn_total = sum(fn),
fp_total = sum(fp),
#cálculo das métricas
recall = tp_total/(tp_total + fn_total),
precision = tp_total/(tp_total + fp_total),
f1_score = (2*(recall*precision))/(recall+precision),
#tempo das avaliações
median_duration_min = median(durationMin),
median_duration = median(duration),
max_duration = max(duration),
max_durationMin = max(durationMin),
median_num_access_node = median(contNodeNumberAccess),
max_num_access_node = max(contNodeNumberAccess),
all_access_node = sum(contNodeNumberAccess),
combination = last(fornecedor),
tipo_exp = last(tipo_exp)
)
metricas_result <- metricas_result %>%
filter(total_itens == 61 | total_itens == 48)
metricas_result %>%
write_csv(here::here("data/resultados_sumarizado.csv"))
metricas_result %>%
arrange(desc(recall))
## # A tibble: 102 x 19
## # Groups: municipio, aproach [85]
## municipio aproach date total_itens tp_total fn_total fp_total
## <chr> <chr> <dttm> <int> <int> <int> <int>
## 1 Alcantil bandit 2019-11-26 11:22:11 48 45 0 3
## 2 Alcantil bandit 2019-11-28 14:06:36 48 45 0 3
## 3 Alcantil dfs 2019-11-27 02:22:33 48 45 0 3
## 4 Cruz do … bandit 2019-11-28 02:20:04 48 45 0 3
## 5 Cruz do … dfs 2019-11-27 01:23:07 48 45 0 3
## 6 Pocinhos bandit 2019-11-25 21:35:00 48 45 0 3
## 7 Pocinhos bandit 2019-11-28 15:02:14 48 45 0 3
## 8 Pocinhos dfs 2019-11-27 02:03:07 48 45 0 3
## 9 Santa Ce… dfs 2019-11-27 23:53:02 48 45 0 3
## 10 Cruz do … dfs 2019-11-23 02:52:17 61 57 1 3
## # … with 92 more rows, and 12 more variables: recall <dbl>, precision <dbl>,
## # f1_score <dbl>, median_duration_min <dbl>, median_duration <dbl>,
## # max_duration <dbl>, max_durationMin <dbl>, median_num_access_node <dbl>,
## # max_num_access_node <dbl>, all_access_node <dbl>, combination <chr>,
## # tipo_exp <chr>
Avaliações por abordagem
metricas_result %>%
group_by(aproach) %>%
summarise(ocorrencia = n()) %>%
ggplot(aes(y=ocorrencia, x=reorder(aproach, +(ocorrencia)))) +
geom_bar(stat = "identity", fill="#5499C7") +
ggtitle("Número de Avaliações por Abordagem") +
xlab("Abordagem") +
ylab("Número de avaliações") +
coord_flip()

Avaliações por valor do Recall e Precision
metricas_result %>%
ggplot(aes(x=recall)) +
geom_histogram(alpha=0.5, position="identity", bins=20)

metricas_result %>%
ggplot(aes(x=precision)) +
geom_histogram(alpha=0.5, position="identity", bins=20)

metricas_result %>%
ggplot(aes(x=recall, color=aproach)) +
geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
facet_grid(aproach ~ .)

metricas_result %>%
ggplot(aes(x=precision, color=aproach)) +
geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
facet_grid(aproach ~ .)

Avaliações por tempo (Min)
metricas_result %>%
ggplot(aes(x=median_duration_min, color=aproach)) +
geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
facet_grid(aproach ~ .)

metricas_result %>%
ggplot(aes(x=max_durationMin, color=aproach)) +
geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
facet_grid(aproach ~ .)

Avaliações por número de nós acessados
metricas_result %>%
ggplot(aes(x=median_num_access_node, color=aproach)) +
geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
facet_grid(aproach ~ .)

metricas_result %>%
ggplot(aes(x=max_num_access_node, color=aproach)) +
geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
facet_grid(aproach ~ .)

Avaliações com Recall abaixo de 0.7
Todas as Avaliações
metricas_result %>%
group_by(municipio) %>%
summarise(bfs = sum(aproach == 'bfs'), dfs = sum(aproach == 'dfs'), bandit = sum(aproach == 'bandit'), tipo_exp=last(tipo_exp)) %>%
arrange(desc(dfs)) %>%
datatable(options = list(pageLength = 10), rownames = FALSE, class = 'cell-border stripe')
metricas_result %>%
select(municipio, aproach, date, recall, precision, f1_score) %>%
arrange(desc(recall)) %>%
datatable(options = list(pageLength = 30), rownames = FALSE, class = 'cell-border stripe')
Métricas
metricas_result %>%
ggplot() +
geom_point(aes(x=aproach, y=recall, color=aproach), position = "jitter")

metricas_result %>%
ggplot(aes(x='',y = recall)) +
geom_boxplot(fill = "white") +
geom_jitter(aes(color = aproach), alpha=0.5, size=3)

metricas_result %>%
ggplot(aes(x = aproach, y = recall)) +
geom_boxplot(fill = "orange")

metricas_result %>%
ggplot() +
geom_point(aes(x=aproach, y=f1_score, color=aproach), position = "jitter")

Tempo de Duração
metricas_result %>%
ggplot() +
geom_point(aes(x=aproach, y=median_duration_min, color=aproach), position = "jitter")

metricas_result %>%
ggplot() +
geom_point(aes(x=aproach, y=max_durationMin, color=aproach), position = "jitter")

metricas_result %>%
ggplot(aes(x='',y = max_durationMin)) +
geom_boxplot(fill = "white") +
geom_jitter(aes(color = aproach), alpha=0.5, size=3)

metricas_result %>%
ggplot(aes(x = aproach, y = max_durationMin)) +
geom_boxplot(fill = "orange")

Nós Acessados
metricas_result %>%
ggplot() +
geom_point(aes(x=aproach, y=max_num_access_node, color=aproach), position = "jitter")

metricas_result %>%
ggplot(aes(x='',y = max_num_access_node)) +
geom_boxplot(fill = "white") +
geom_jitter(aes(color = aproach), alpha=0.5, size=3)

metricas_result %>%
ggplot(aes(x = aproach, y = max_num_access_node), ) +
geom_boxplot(fill = "orange")

IC
set.seed(123)
f1_score <- function (d, i) {
dt<-d[i,]
c(
dt$f1_score
)
}
bootstraps <- boot(data = metricas_result,
statistic = f1_score, # <- referência para a função
R = 4000) # número de bootstraps
ci.tb = tidy(bootstraps,
conf.level = .95,
conf.method = "basic",
conf.int = TRUE)
glimpse(ci.tb)
## Observations: 102
## Variables: 5
## $ statistic <dbl> 0.9677419, 0.9677419, 0.9565217, 0.9677419, 0.9450549, 0.94…
## $ bias <dbl> -0.053225609, -0.054290969, -0.044000163, -0.054400912, -0.…
## $ std.error <dbl> 0.07454978, 0.07594377, 0.08290781, 0.08111319, 0.07258086,…
## $ conf.low <dbl> 0.9469781, 0.9469781, 0.9245377, 0.9469781, 0.9016041, 0.90…
## $ conf.high <dbl> 1.2296015, 1.2296015, 1.2071611, 1.2296015, 1.1842275, 1.18…
ci.tb %>%
ggplot(aes(x = "", y = statistic,
ymin = conf.low,
ymax = conf.high)) +
geom_pointrange() +
geom_point(size = 3) +
labs(y = "F1-score",
x = "") +
theme(axis.title = element_text(size=10))

#Calcula a media das posições escolhidas nas buscas.
set.seed(123)
f1_score_boot <- function (d, i) {
dt<-d[i,]
c(
dt$f1_score
)
}
boot.aproach <- metricas_result %>%
group_by(aproach) %>%
mutate(cors_boot = list(
boot(
data = metricas_result,
statistic = f1_score_boot,
R = 4000
)
)
)
ics.aproach <- boot.aproach %>%
group_by(aproach) %>%
summarise(
ci = list(tidy(cors_boot[[1]],
conf.level = .95,
conf.method = "basic",
conf.int = TRUE))
) %>%
unnest(ci)
ics.aproach %>%
ggplot(aes(x = aproach, y = statistic,
ymin = conf.low,
ymax = conf.high)) +
geom_pointrange() +
geom_point(size = 3) +
labs(y = "F1-score",
x = "") +
theme(axis.title = element_text(size=10))

metricas_result %>%
ggplot() +
geom_boxplot(aes(x=combination, y=max_num_access_node), fill = "white") +
geom_point(alpha = 0.4, aes(x=combination, y=max_num_access_node, color=aproach), position = "jitter") +
coord_flip()

metricas_result %>%
ggplot() +
geom_boxplot(aes(x=combination, y=max_durationMin), fill = "white") +
geom_point(alpha = 0.4, aes(x=combination, y=max_durationMin, color=aproach), position = "jitter") +
coord_flip()

metricas_result %>%
filter(combination == 'Publicsoft') %>%
ggplot() +
geom_boxplot(aes(x=combination, y=max_durationMin), fill = "white") +
geom_point(alpha = 0.4, aes(x=combination, y=max_durationMin, color=aproach), position = "jitter")
